bitkeeper revision 1.1159.1.564 (420b44edsb8XzPev-TiGW16GSsCW6g)
author kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Thu, 10 Feb 2005 11:26:37 +0000 (11:26 +0000)
committer kaf24@scramble.cl.cam.ac.uk <kaf24@scramble.cl.cam.ac.uk>
Thu, 10 Feb 2005 11:26:37 +0000 (11:26 +0000)
More x86_64 stuff. Added hypercalls to register a user-space pagetable,
modify FS/GS base addresses, and switch to user mode. User mode switches
back to kernel mode automatically on executing SYSCALL instruction.
Still todo: 1. getdomaininfo needs to include pagetable_user
            2. get writable and shadow pagetables working
            3. testing
Signed-off-by: keir.fraser@cl.cam.ac.uk
16 files changed:
xen/arch/x86/domain.c
xen/arch/x86/mm.c
xen/arch/x86/setup.c
xen/arch/x86/vmx_io.c
xen/arch/x86/vmx_vmcs.c
xen/arch/x86/x86_32/entry.S
xen/arch/x86/x86_64/entry.S
xen/arch/x86/x86_64/mm.c
xen/arch/x86/x86_64/traps.c
xen/include/asm-x86/domain.h
xen/include/asm-x86/msr.h
xen/include/asm-x86/processor.h
xen/include/asm-x86/x86_32/current.h
xen/include/asm-x86/x86_64/current.h
xen/include/public/arch-x86_64.h
xen/include/public/xen.h

index 59c9dc1e927ea4c588c477d524f6f18d8b56e873..294ed178c1018879711a461f0d7dd4c501ecde50 100644 (file)
@@ -256,6 +256,8 @@ void arch_do_createdomain(struct exec_domain *ed)
 
     SET_DEFAULT_FAST_TRAP(&ed->arch);
 
+    ed->arch.flags = TF_kernel_mode;
+
     if ( d->id == IDLE_DOMAIN_ID )
     {
         ed->arch.schedule_tail = continue_idle_task;
@@ -287,8 +289,6 @@ void arch_do_createdomain(struct exec_domain *ed)
         d->arch.mm_perdomain_l3[l3_table_offset(PERDOMAIN_VIRT_START)] = 
             mk_l3_pgentry(__pa(d->arch.mm_perdomain_l2) | __PAGE_HYPERVISOR);
 #endif
-
-        ed->arch.flags = TF_kernel_mode;
     }
 }
 
@@ -550,6 +550,172 @@ void new_thread(struct exec_domain *d,
 }
 
 
+#ifdef __x86_64__
+
+#define loadsegment(seg,value) ({               \
+    int __r = 1;                                \
+    __asm__ __volatile__ (                      \
+        "1: movl %k1,%%" #seg "\n2:\n"          \
+        ".section .fixup,\"ax\"\n"              \
+        "3: xorl %k0,%k0\n"                     \
+        "   movl %k0,%%" #seg "\n"              \
+        "   jmp 2b\n"                           \
+        ".previous\n"                           \
+        ".section __ex_table,\"a\"\n"           \
+        "   .align 8\n"                         \
+        "   .quad 1b,3b\n"                      \
+        ".previous"                             \
+        : "=r" (__r) : "r" (value), "0" (__r) );\
+    __r; })
+
+static void switch_segments(
+    struct xen_regs *regs, struct exec_domain *p, struct exec_domain *n)
+{
+    int all_segs_okay = 1;
+
+    if ( !is_idle_task(p->domain) )
+    {
+        __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (p->arch.user_ctxt.ds) );
+        __asm__ __volatile__ ( "movl %%es,%0" : "=m" (p->arch.user_ctxt.es) );
+        __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (p->arch.user_ctxt.fs) );
+        __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (p->arch.user_ctxt.gs) );
+    }
+
+    /* Either selector != 0 ==> reload. */
+    if ( unlikely(p->arch.user_ctxt.ds |
+                  n->arch.user_ctxt.ds) )
+        all_segs_okay &= loadsegment(ds, n->arch.user_ctxt.ds);
+
+    /* Either selector != 0 ==> reload. */
+    if ( unlikely(p->arch.user_ctxt.es |
+                  n->arch.user_ctxt.es) )
+        all_segs_okay &= loadsegment(es, n->arch.user_ctxt.es);
+
+    /*
+     * Either selector != 0 ==> reload.
+     * Also reload to reset FS_BASE if it was non-zero.
+     */
+    if ( unlikely(p->arch.user_ctxt.fs |
+                  p->arch.user_ctxt.fs_base |
+                  n->arch.user_ctxt.fs) )
+    {
+        all_segs_okay &= loadsegment(fs, n->arch.user_ctxt.fs);
+        if ( p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
+            p->arch.user_ctxt.fs_base = 0;
+    }
+
+    /*
+     * Either selector != 0 ==> reload.
+     * Also reload to reset GS_BASE if it was non-zero.
+     */
+    if ( unlikely(p->arch.user_ctxt.gs |
+                  p->arch.user_ctxt.gs_base_user |
+                  n->arch.user_ctxt.gs) )
+    {
+        /* Reset GS_BASE with user %gs? */
+        if ( p->arch.user_ctxt.gs || !n->arch.user_ctxt.gs_base_user )
+            all_segs_okay &= loadsegment(gs, n->arch.user_ctxt.gs);
+        if ( p->arch.user_ctxt.gs ) /* != 0 selector kills gs_base_user */
+            p->arch.user_ctxt.gs_base_user = 0;
+    }
+
+    /* This can only be non-zero if selector is NULL. */
+    if ( n->arch.user_ctxt.fs_base )
+        wrmsr(MSR_FS_BASE,
+              n->arch.user_ctxt.fs_base,
+              n->arch.user_ctxt.fs_base>>32);
+
+    /* This can only be non-zero if selector is NULL. */
+    if ( n->arch.user_ctxt.gs_base_user )
+        wrmsr(MSR_GS_BASE,
+              n->arch.user_ctxt.gs_base_user,
+              n->arch.user_ctxt.gs_base_user>>32);
+
+    /* This can only be non-zero if selector is NULL. */
+    if ( p->arch.user_ctxt.gs_base_kernel |
+         n->arch.user_ctxt.gs_base_kernel )
+        wrmsr(MSR_SHADOW_GS_BASE,
+              n->arch.user_ctxt.gs_base_kernel,
+              n->arch.user_ctxt.gs_base_kernel>>32);
+
+    /* If in kernel mode then switch the GS bases around. */
+    if ( n->arch.flags & TF_kernel_mode )
+        __asm__ __volatile__ ( "swapgs" );
+
+    if ( unlikely(!all_segs_okay) )
+    {
+        unsigned long *rsp =
+            (n->arch.flags & TF_kernel_mode) ?
+            (unsigned long *)regs->rsp : 
+            (unsigned long *)n->arch.kernel_sp;
+
+        if ( put_user(regs->ss,     rsp- 1) |
+             put_user(regs->rsp,    rsp- 2) |
+             put_user(regs->rflags, rsp- 3) |
+             put_user(regs->cs,     rsp- 4) |
+             put_user(regs->rip,    rsp- 5) |
+             put_user(regs->gs,     rsp- 6) |
+             put_user(regs->fs,     rsp- 7) |
+             put_user(regs->es,     rsp- 8) |
+             put_user(regs->ds,     rsp- 9) |
+             put_user(regs->r11,    rsp-10) |
+             put_user(regs->rcx,    rsp-11) )
+        {
+            DPRINTK("Error while creating failsafe callback frame.\n");
+            domain_crash();
+        }
+
+        if ( !(n->arch.flags & TF_kernel_mode) )
+        {
+            n->arch.flags |= TF_kernel_mode;
+            __asm__ __volatile__ ( "swapgs" );
+            write_ptbase(n);
+        }
+
+        regs->entry_vector  = TRAP_syscall;
+        regs->rflags       &= 0xFFFCBEFFUL;
+        regs->ss            = __GUEST_SS;
+        regs->rsp           = (unsigned long)(rsp-11);
+        regs->cs            = __GUEST_CS;
+        regs->rip           = n->arch.failsafe_address;
+    }
+}
+
+long do_switch_to_user(void)
+{
+    struct xen_regs       *regs = get_execution_context();
+    struct switch_to_user  stu;
+    struct exec_domain    *ed = current;
+
+    if ( unlikely(copy_from_user(&stu, (void *)regs->rsp, sizeof(stu))) )
+        return -EFAULT;
+
+    ed->arch.flags &= ~TF_kernel_mode;
+    __asm__ __volatile__ ( "swapgs" );
+    write_ptbase(ed);
+
+    regs->rip    = stu.rip;
+    regs->cs     = stu.cs;
+    regs->rflags = stu.rflags;
+    regs->rsp    = stu.rsp;
+    regs->ss     = stu.ss;
+
+    if ( !(stu.flags & ECF_IN_SYSCALL) )
+    {
+        regs->entry_vector = 0;
+        regs->r11 = stu.r11;
+        regs->rcx = stu.rcx;
+    }
+    
+    return regs->rax;
+}
+
+#elif defined(__i386__)
+
+#define switch_segments(_r, _p, _n) ((void)0)
+
+#endif
+
 /*
  * This special macro can be used to load a debugging register
  */
@@ -566,21 +732,12 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
 #ifdef CONFIG_VMX
     unsigned long vmx_domain = next_p->arch.arch_vmx.flags; 
 #endif
-#ifdef __x86_64__
-    int all_segs_okay = 1;
-#endif
 
     __cli();
 
     /* Switch guest general-register state. */
     if ( !is_idle_task(prev_p->domain) )
     {
-#ifdef __x86_64__
-        __asm__ __volatile__ ( "movl %%ds,%0" : "=m" (stack_ec->ds) );
-        __asm__ __volatile__ ( "movl %%es,%0" : "=m" (stack_ec->es) );
-        __asm__ __volatile__ ( "movl %%fs,%0" : "=m" (stack_ec->fs) );
-        __asm__ __volatile__ ( "movl %%gs,%0" : "=m" (stack_ec->gs) );
-#endif
         memcpy(&prev_p->arch.user_ctxt,
                stack_ec, 
                sizeof(*stack_ec));
@@ -624,7 +781,7 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
         SET_FAST_TRAP(&next_p->arch);
 
 #ifdef __i386__
-        /* Switch the guest OS ring-1 stack. */
+        /* Switch the kernel ring-1 stack. */
         tss->esp1 = next_p->arch.kernel_sp;
         tss->ss1  = next_p->arch.kernel_ss;
 #endif
@@ -660,126 +817,7 @@ void switch_to(struct exec_domain *prev_p, struct exec_domain *next_p)
 
     __sti();
 
-#ifdef __x86_64__
-
-#define loadsegment(seg,value) ({               \
-    int __r = 1;                                \
-    __asm__ __volatile__ (                      \
-        "1: movl %k1,%%" #seg "\n2:\n"          \
-        ".section .fixup,\"ax\"\n"              \
-        "3: xorl %k0,%k0\n"                     \
-        "   movl %k0,%%" #seg "\n"              \
-        "   jmp 2b\n"                           \
-        ".previous\n"                           \
-        ".section __ex_table,\"a\"\n"           \
-        "   .align 8\n"                         \
-        "   .quad 1b,3b\n"                      \
-        ".previous"                             \
-        : "=r" (__r) : "r" (value), "0" (__r) );\
-    __r; })
-
-    /* Either selector != 0 ==> reload. */
-    if ( unlikely(prev_p->arch.user_ctxt.ds) ||
-         unlikely(next_p->arch.user_ctxt.ds) )
-        all_segs_okay &= loadsegment(ds, next_p->arch.user_ctxt.ds);
-
-    /* Either selector != 0 ==> reload. */
-    if ( unlikely(prev_p->arch.user_ctxt.es) ||
-         unlikely(next_p->arch.user_ctxt.es) )
-        all_segs_okay &= loadsegment(es, next_p->arch.user_ctxt.es);
-
-    /*
-     * Either selector != 0 ==> reload.
-     * Also reload to reset FS_BASE if it was non-zero.
-     */
-    if ( unlikely(prev_p->arch.user_ctxt.fs) ||
-         unlikely(prev_p->arch.user_ctxt.fs_base) ||
-         unlikely(next_p->arch.user_ctxt.fs) )
-    {
-        all_segs_okay &= loadsegment(fs, next_p->arch.user_ctxt.fs);
-        if ( prev_p->arch.user_ctxt.fs ) /* != 0 selector kills fs_base */
-            prev_p->arch.user_ctxt.fs_base = 0;
-    }
-
-    /*
-     * Either selector != 0 ==> reload.
-     * Also reload to reset GS_BASE if it was non-zero.
-     */
-    if ( unlikely(prev_p->arch.user_ctxt.gs) ||
-         unlikely(prev_p->arch.user_ctxt.gs_base_os) ||
-         unlikely(prev_p->arch.user_ctxt.gs_base_app) ||
-         unlikely(next_p->arch.user_ctxt.gs) )
-    {
-        /* Reset GS_BASE with user %gs. */
-        all_segs_okay &= loadsegment(gs, next_p->arch.user_ctxt.gs);
-        /* Reset KERNEL_GS_BASE if we won't be doing it later. */
-        if ( !next_p->arch.user_ctxt.gs_base_os )
-            wrmsr(MSR_KERNEL_GS_BASE, 0, 0);
-        if ( prev_p->arch.user_ctxt.gs ) /* != 0 selector kills app gs_base */
-            prev_p->arch.user_ctxt.gs_base_app = 0;
-    }
-
-    /* This can only be non-zero if selector is NULL. */
-    if ( next_p->arch.user_ctxt.fs_base )
-        wrmsr(MSR_FS_BASE,
-              next_p->arch.user_ctxt.fs_base,
-              next_p->arch.user_ctxt.fs_base>>32);
-
-    /* This can only be non-zero if selector is NULL. */
-    if ( next_p->arch.user_ctxt.gs_base_os )
-        wrmsr(MSR_KERNEL_GS_BASE,
-              next_p->arch.user_ctxt.gs_base_os,
-              next_p->arch.user_ctxt.gs_base_os>>32);
-
-    /* This can only be non-zero if selector is NULL. */
-    if ( next_p->arch.user_ctxt.gs_base_app )
-        wrmsr(MSR_GS_BASE,
-              next_p->arch.user_ctxt.gs_base_app,
-              next_p->arch.user_ctxt.gs_base_app>>32);
-
-    /* If in guest-OS mode, switch the GS bases around. */
-    if ( next_p->arch.flags & TF_kernel_mode )
-        __asm__ __volatile__ ( "swapgs" );
-
-    if ( unlikely(!all_segs_okay) )
-    {
-        unsigned long *rsp =
-            (next_p->arch.flags & TF_kernel_mode) ?
-            (unsigned long *)stack_ec->rsp : 
-            (unsigned long *)next_p->arch.kernel_sp;
-
-        if ( put_user(stack_ec->ss,     rsp- 1) |
-             put_user(stack_ec->rsp,    rsp- 2) |
-             put_user(stack_ec->rflags, rsp- 3) |
-             put_user(stack_ec->cs,     rsp- 4) |
-             put_user(stack_ec->rip,    rsp- 5) |
-             put_user(stack_ec->gs,     rsp- 6) |
-             put_user(stack_ec->fs,     rsp- 7) |
-             put_user(stack_ec->es,     rsp- 8) |
-             put_user(stack_ec->ds,     rsp- 9) |
-             put_user(stack_ec->r11,    rsp-10) |
-             put_user(stack_ec->rcx,    rsp-11) )
-        {
-            DPRINTK("Error while creating failsafe callback frame.\n");
-            domain_crash();
-        }
-
-        if ( !(next_p->arch.flags & TF_kernel_mode) )
-        {
-            next_p->arch.flags |= TF_kernel_mode;
-            __asm__ __volatile__ ( "swapgs" );
-            /* XXX switch page tables XXX */
-        }
-
-        stack_ec->entry_vector  = TRAP_syscall;
-        stack_ec->rflags       &= 0xFFFCBEFFUL;
-        stack_ec->ss            = __GUEST_SS;
-        stack_ec->rsp           = (unsigned long)(rsp-11);
-        stack_ec->cs            = __GUEST_CS;
-        stack_ec->rip           = next_p->arch.failsafe_address;
-    }
-
-#endif /* __x86_64__ */
+    switch_segments(stack_ec, prev_p, next_p);
 }
 
 
@@ -935,13 +973,23 @@ void domain_relinquish_memory(struct domain *d)
     /* Exit shadow mode before deconstructing final guest page table. */
     shadow_mode_disable(d);
 
-    /* Drop the in-use reference to the page-table base. */
+    /* Drop the in-use references to page-table bases. */
     for_each_exec_domain ( d, ed )
     {
         if ( pagetable_val(ed->arch.pagetable) != 0 )
-            put_page_and_type(&frame_table[pagetable_val(ed->arch.pagetable) >>
-                                           PAGE_SHIFT]);
-        ed->arch.pagetable = mk_pagetable(0);
+        {
+            put_page_and_type(
+                &frame_table[pagetable_val(ed->arch.pagetable) >> PAGE_SHIFT]);
+            ed->arch.pagetable = mk_pagetable(0);
+        }
+
+        if ( pagetable_val(ed->arch.pagetable_user) != 0 )
+        {
+            put_page_and_type(
+                &frame_table[pagetable_val(ed->arch.pagetable_user) >>
+                            PAGE_SHIFT]);
+            ed->arch.pagetable_user = mk_pagetable(0);
+        }
     }
 
 #ifdef CONFIG_VMX
index f4e11f519414c94bd70e93255bde6710b69959c5..123ff0fbda8a94a65aa8cad087c31f937670c43f 100644 (file)
@@ -209,6 +209,10 @@ void write_ptbase(struct exec_domain *ed)
 #else
     if ( unlikely(shadow_mode(d)) )
         pa = pagetable_val(ed->arch.shadow_table);    
+#ifdef __x86_64__
+    else if ( !(ed->arch.flags & TF_kernel_mode) )
+        pa = pagetable_val(ed->arch.pagetable_user);
+#endif
     else
         pa = pagetable_val(ed->arch.pagetable);
 #endif
@@ -1341,6 +1345,24 @@ static int do_extended_command(unsigned long ptr, unsigned long val)
         okay = new_guest_cr3(pfn);
         break;
         
+#ifdef __x86_64__
+    case MMUEXT_NEW_USER_BASEPTR:
+        okay = get_page_and_type_from_pagenr(pfn, PGT_root_page_table, d);
+        if ( unlikely(!okay) )
+        {
+            MEM_LOG("Error while installing new baseptr %p", pfn);
+        }
+        else
+        {
+            unsigned long old_pfn =
+                pagetable_val(ed->arch.pagetable_user) >> PAGE_SHIFT;
+            ed->arch.pagetable_user = mk_pagetable(pfn << PAGE_SHIFT);
+            if ( old_pfn != 0 )
+                put_page_and_type(&frame_table[old_pfn]);
+        }
+        break;
+#endif
+        
     case MMUEXT_TLB_FLUSH:
         percpu_info[cpu].deferred_ops |= DOP_FLUSH_TLB;
         break;
index 8c699b4586fc22b1e949ddcff0c11f843d3fe3fb..ba8e662ad412abdc970e2f95d09a859be81c1c36 100644 (file)
@@ -325,9 +325,9 @@ void __init cpu_init(void)
     memset(t->io_bitmap, ~0, sizeof(t->io_bitmap));
 #if defined(__i386__)
     t->ss0  = __HYPERVISOR_DS;
-    t->esp0 = get_stack_top();
+    t->esp0 = get_stack_bottom();
 #elif defined(__x86_64__)
-    t->rsp0 = get_stack_top();
+    t->rsp0 = get_stack_bottom();
 #endif
     set_tss_desc(nr,t);
     load_TR(nr);
index 3241ada86508cac7a7cb5d079623c7f4f1e5a075..9fbaa05298641f30cf3af185214a2657c3100d3f 100644 (file)
@@ -382,7 +382,7 @@ void vmx_do_resume(struct exec_domain *d)
 {
     __vmwrite(HOST_CR3, pagetable_val(d->arch.monitor_table));
     __vmwrite(GUEST_CR3, pagetable_val(d->arch.shadow_table));
-    __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+    __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom());
 
     if (event_pending(d)) {
         if (test_bit(IOPACKET_PORT, &d->domain->shared_info->evtchn_pending[0])) 
index 07af40ca24c0415a49f5fed13e76bb9f601ecdc7..c9f1e9de1b42849bf0ec4a6060b4c7613ab19f76 100644 (file)
@@ -222,7 +222,7 @@ void vmx_do_launch(struct exec_domain *ed)
     ed->arch.shadow_table = ed->arch.pagetable;
     __vmwrite(GUEST_CR3, pagetable_val(ed->arch.pagetable));
     __vmwrite(HOST_CR3, pagetable_val(ed->arch.monitor_table));
-    __vmwrite(HOST_ESP, (unsigned long) get_stack_top());
+    __vmwrite(HOST_ESP, (unsigned long)get_stack_bottom());
 
     ed->arch.schedule_tail = arch_vmx_do_resume;
 }
index 45fec67827b611eed8a8eab9daeec4e386a6222b..3e33befc7934e9823c36bb5c8750afc7cd0905eb 100644 (file)
@@ -81,7 +81,7 @@
  *   (9)  u32 fs;
  *   (8)  u32 ds;
  *   (7)  u32 es;
- *               <- get_stack_top() (= HOST_ESP)
+ *               <- get_stack_bottom() (= HOST_ESP)
  *   (6)  u32 ss;
  *   (5)  u32 esp;
  *   (4)  u32 eflags;
@@ -89,8 +89,8 @@
  *   (2)  u32 eip;
  * (2/1)  u16 entry_vector;
  * (1/1)  u16 error_code;
- * However, get_stack_top() acturally returns 20 bytes below the real
- * top of the stack to allow space for:
+ * However, get_stack_bottom() actually returns 20 bytes before the real
+ * bottom of the stack to allow space for:
  * domain pointer, DS, ES, FS, GS. Therefore, we effectively skip 6 registers.
  */
 #define VMX_MONITOR_EFLAGS     0x202 /* IF on */
@@ -173,8 +173,8 @@ vmx_process_softirqs:
 
         ALIGN
 restore_all_guest:
-        testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
-        jnz  failsafe_callback
+        btr  $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+        jc   failsafe_callback
         testl $X86_EFLAGS_VM,XREGS_eflags(%esp)
         jnz  restore_all_vm86
 FLT1:   movl XREGS_ds(%esp),%ds
@@ -216,9 +216,8 @@ FIX1:   SET_XEN_SEGMENTS(a)
 DBLFLT1:GET_CURRENT(%ebx)
         jmp   test_all_events
 DBLFIX1:GET_CURRENT(%ebx)
-        testb $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
-        jnz   domain_crash             # cannot reenter failsafe code
-        orb   $TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+        bts   $_TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
+        jc    domain_crash             # cannot reenter failsafe code
         jmp   test_all_events          # will return via failsafe code
 .previous
 .section __pre_ex_table,"a"
@@ -235,7 +234,6 @@ DBLFIX1:GET_CURRENT(%ebx)
 /* No special register assumptions */
 failsafe_callback:
         GET_CURRENT(%ebx)
-        andb $~TF_failsafe_return,EDOMAIN_thread_flags(%ebx)
         leal EDOMAIN_trap_bounce(%ebx),%edx
         movl EDOMAIN_failsafe_addr(%ebx),%eax
         movl %eax,TRAPBOUNCE_eip(%edx)
@@ -282,8 +280,6 @@ ENTRY(hypercall)
         GET_CURRENT(%ebx)
        andl $(NR_hypercalls-1),%eax
        call *SYMBOL_NAME(hypercall_table)(,%eax,4)
-
-ret_from_hypercall:
         movl %eax,XREGS_eax(%esp)       # save the return value
 
 test_all_events:
index ccda5d7008ac6f163f468d561833349a434b2167..8c00a685ad1a2dfb0c10ffdbc296e88d98470b61 100644 (file)
@@ -20,8 +20,8 @@
 
         ALIGN
 restore_all_guest:
-        testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
-        jnz   failsafe_callback
+        btr   $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+        jc    failsafe_callback
         RESTORE_ALL
         testw $TRAP_syscall,4(%rsp)
         jz    1f
@@ -50,9 +50,8 @@ FIX1:   popq  -15*8-8(%rsp)            # error_code/entry_vector
 DBLFLT1:GET_CURRENT(%rbx)
         jmp   test_all_events
 DBLFIX1:GET_CURRENT(%rbx)
-        testb $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
-        jnz   domain_crash             # cannot reenter failsafe code
-        orb   $TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+        bts   $_TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
+        jc    domain_crash             # cannot reenter failsafe code
         jmp   test_all_events          # will return via failsafe code
 .previous
 .section __pre_ex_table,"a"
@@ -65,7 +64,6 @@ DBLFIX1:GET_CURRENT(%rbx)
 /* No special register assumptions */
 failsafe_callback:
         GET_CURRENT(%rbx)
-        andb $~TF_failsafe_return,EDOMAIN_thread_flags(%rbx)
         leaq EDOMAIN_trap_bounce(%rbx),%rdx
         movq EDOMAIN_failsafe_addr(%rbx),%rax
         movq %rax,TRAPBOUNCE_eip(%rdx)
@@ -97,8 +95,7 @@ restore_all_xen:
  * NB. We must move %r10 to %rcx for C function-calling ABI.
  */
         ALIGN
-ENTRY(hypercall)
-        sti
+ENTRY(syscall_enter)
         movl  $__GUEST_SS,8(%rsp)
         pushq %r11
         pushq $__GUEST_CS
@@ -106,13 +103,20 @@ ENTRY(hypercall)
         pushq $0
         movl  $TRAP_syscall,4(%rsp)
         SAVE_ALL
-        movq  %r10,%rcx
-        andq  $(NR_hypercalls-1),%rax
-        leaq  SYMBOL_NAME(hypercall_table)(%rip),%rbx
-        callq *(%rbx,%rax,8)
         GET_CURRENT(%rbx)
+        bts   $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+        jc    hypercall
+        swapgs
+        movq  %rbx,%rdi
+        call  SYMBOL_NAME(write_ptbase)
+        jmp   restore_all_guest
 
-ret_from_hypercall:
+hypercall:
+        sti
+        movq  %r10,%rcx
+        andq  $(NR_hypercalls-1),%rax
+        leaq  SYMBOL_NAME(hypercall_table)(%rip),%r10
+        callq *(%r10,%rax,8)
         movq %rax,XREGS_rax(%rsp)       # save the return value
 
 test_all_events:
@@ -154,7 +158,7 @@ create_bounce_frame:
         movq  XREGS_rsp+8(%rsp),%rsi
         testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
         jnz   1f
-        /* Push new frame at registered guest-OS stack top. */
+        /* Push new frame at registered guest-OS stack base. */
         movq  EDOMAIN_kernel_sp(%rbx),%rsi
 1:      movq  $HYPERVISOR_VIRT_START,%rax
         cmpq  %rax,%rsi
@@ -203,11 +207,11 @@ FLT15:  movq  %rax,(%rsi)               # RCX
         /* Rewrite our stack frame and return to guest-OS mode. */
         /* IA32 Ref. Vol. 3: TF, VM, RF and NT flags are cleared on trap. */
         movb  $0,TRAPBOUNCE_flags(%rdx)
-        testb $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
-        jnz   1f
-        orb   $TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+        bts   $_TF_kernel_mode,EDOMAIN_thread_flags(%rbx)
+        jc    1f
         swapgs
-        /* XXX switch page tables XXX */
+        movq  %rbx,%rdi
+        call  SYMBOL_NAME(write_ptbase)
 1:      movl  $TRAP_syscall,XREGS_entry_vector+8(%rsp)
         andl  $0xfffcbeff,XREGS_eflags+8(%rsp)
         movl  $__GUEST_SS,XREGS_ss+8(%rsp)
@@ -425,7 +429,7 @@ ENTRY(hypercall_table)
         .quad SYMBOL_NAME(do_set_debugreg)
         .quad SYMBOL_NAME(do_get_debugreg)
         .quad SYMBOL_NAME(do_update_descriptor)  /* 10 */
-        .quad SYMBOL_NAME(do_ni_hypercall) # do_set_fast_trap
+        .quad SYMBOL_NAME(do_ni_hypercall)
         .quad SYMBOL_NAME(do_dom_mem_op)
         .quad SYMBOL_NAME(do_multicall)
         .quad SYMBOL_NAME(do_update_va_mapping)
@@ -437,8 +441,9 @@ ENTRY(hypercall_table)
         .quad SYMBOL_NAME(do_grant_table_op)     /* 20 */
         .quad SYMBOL_NAME(do_vm_assist)
         .quad SYMBOL_NAME(do_update_va_mapping_otherdomain)
-        .quad SYMBOL_NAME(do_ni_hypercall) # do_switch_vm86
+        .quad SYMBOL_NAME(do_switch_to_user)
         .quad SYMBOL_NAME(do_boot_vcpu)
+        .quad SYMBOL_NAME(do_set_segment_base)   /* 25 */
         .rept NR_hypercalls-((.-hypercall_table)/4)
         .quad SYMBOL_NAME(do_ni_hypercall)
         .endr
index 52230df5bfa1630f2e8b09c2eabcfa21efe53bbe..f69d06a1cfdf422ab3a3c7d2bb173d43968cc9c5 100644 (file)
@@ -26,7 +26,7 @@
 #include <asm/page.h>
 #include <asm/flushtlb.h>
 #include <asm/fixmap.h>
-#include <asm/domain_page.h>
+#include <asm/msr.h>
 
 void *safe_page_alloc(void)
 {
@@ -238,6 +238,34 @@ long do_stack_switch(unsigned long ss, unsigned long esp)
     return 0;
 }
 
+long do_set_segment_base(unsigned int which, unsigned long base)
+{
+    struct exec_domain *ed = current;
+
+    switch ( which )
+    {
+    case SEGBASE_FS:
+        ed->arch.user_ctxt.fs_base = base;
+        wrmsr(MSR_FS_BASE, base, base>>32);
+        break;
+
+    case SEGBASE_GS_USER:
+        ed->arch.user_ctxt.gs_base_user = base;
+        wrmsr(MSR_SHADOW_GS_BASE, base, base>>32);
+        break;
+
+    case SEGBASE_GS_KERNEL:
+        ed->arch.user_ctxt.gs_base_kernel = base;
+        wrmsr(MSR_GS_BASE, base, base>>32);
+        break;
+
+    default:
+        return -EINVAL;
+    }
+
+    return 0;
+}
+
 
 /* Returns TRUE if given descriptor is valid for GDT or LDT. */
 int check_descriptor(struct desc_struct *d)
index 96ae0424b4d6366503782a1abae2b0e8c1c980c3..3c7c8ea7ec0f019c861651c040142070e95cb0e2 100644 (file)
@@ -153,12 +153,14 @@ asmlinkage void do_double_fault(struct xen_regs *regs)
         __asm__ __volatile__ ( "hlt" );
 }
 
-asmlinkage void hypercall(void);
+asmlinkage void syscall_enter(void);
 void __init percpu_traps_init(void)
 {
-    char *stack_top = (char *)get_stack_top();
-    char *stack     = (char *)((unsigned long)stack_top & ~(STACK_SIZE - 1));
-    int   cpu       = smp_processor_id();
+    char *stack_bottom, *stack;
+    int   cpu = smp_processor_id();
+
+    stack_bottom = (char *)get_stack_bottom();
+    stack        = (char *)((unsigned long)stack_bottom & ~(STACK_SIZE - 1));
 
     /* Double-fault handler has its own per-CPU 1kB stack. */
     init_tss[cpu].ist[0] = (unsigned long)&stack[1024];
@@ -181,17 +183,17 @@ void __init percpu_traps_init(void)
     stack[0] = 0x48;
     stack[1] = 0x89;
     stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16;
+    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
 
     /* leaq saversp(%rip), %rsp */
     stack[7] = 0x48;
     stack[8] = 0x8d;
     stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16;
+    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
 
-    /* jmp hypercall */
+    /* jmp syscall_enter */
     stack[14] = 0xe9;
-    *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
+    *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19];
 
     /*
      * Trampoline for SYSCALL entry from compatibility mode.
@@ -205,17 +207,17 @@ void __init percpu_traps_init(void)
     stack[0] = 0x48;
     stack[1] = 0x89;
     stack[2] = 0x25;
-    *(u32 *)&stack[3] = (stack_top - &stack[7]) - 16;
+    *(u32 *)&stack[3] = (stack_bottom - &stack[7]) - 16;
 
     /* leaq saversp(%rip), %rsp */
     stack[7] = 0x48;
     stack[8] = 0x8d;
     stack[9] = 0x25;
-    *(u32 *)&stack[10] = (stack_top - &stack[14]) - 16;
+    *(u32 *)&stack[10] = (stack_bottom - &stack[14]) - 16;
 
-    /* jmp hypercall */
+    /* jmp syscall_enter */
     stack[14] = 0xe9;
-    *(u32 *)&stack[15] = (char *)hypercall - &stack[19];
+    *(u32 *)&stack[15] = (char *)syscall_enter - &stack[19];
 
     /*
      * Common SYSCALL parameters.
index a6e5a9e5b0d15b7ce7ba45fbf80626aeb99cacdc..d8821f50e38d69ca43b874af902fba5a513c0718 100644 (file)
@@ -98,6 +98,7 @@ struct arch_exec_domain
      */
     l1_pgentry_t *perdomain_ptes;
     pagetable_t  pagetable;
+    pagetable_t  pagetable_user;  /* x86/64: user-space pagetable. */
 
     pagetable_t  monitor_table;
     pagetable_t  phys_table;            /* 1:1 pagetable */
index ea17a45aef38b7b8f189031f598c3cff1fc624f2..a7178f2e3b5e5e45bd828f1338640e7d39f2dc11 100644 (file)
@@ -63,7 +63,7 @@
 #define MSR_SYSCALL_MASK 0xc0000084    /* EFLAGS mask for syscall */
 #define MSR_FS_BASE 0xc0000100         /* 64bit GS base */
 #define MSR_GS_BASE 0xc0000101         /* 64bit FS base */
-#define MSR_KERNEL_GS_BASE  0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ 
+#define MSR_SHADOW_GS_BASE  0xc0000102 /* SwapGS GS shadow */ 
 /* EFER bits: */ 
 #define _EFER_SCE 0  /* SYSCALL/SYSRET */
 #define _EFER_LME 8  /* Long mode enable */
index 30c6079e8523055cf3e7b07e5af50788052c316c..4e4d648480787ff1a9109131924cc2b4343da0a6 100644 (file)
 #define TBF_FAILSAFE          16
 
 /* arch_exec_domain' flags values */
-#define TF_failsafe_return     1
-#define TF_kernel_mode        2
+#define _TF_failsafe_return    0
+#define _TF_kernel_mode        1
+#define TF_failsafe_return     (1<<_TF_failsafe_return)
+#define TF_kernel_mode         (1<<_TF_kernel_mode)
 
 #ifndef __ASSEMBLY__
 
index 3c254191baf404bbdaf58d070b184ac5f1b27a96..38a3adff6116a9f375cbdd09136d7baa6d4c5a81 100644 (file)
@@ -34,11 +34,11 @@ static inline execution_context_t *get_execution_context(void)
 }
 
 /*
- * Get the top-of-stack, as stored in the per-CPU TSS. This is actually
- * 20 bytes below the real top of the stack to allow space for:
+ * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually
+ * 20 bytes before the real bottom of the stack to allow space for:
  *  domain pointer, DS, ES, FS, GS.
  */
-static inline unsigned long get_stack_top(void)
+static inline unsigned long get_stack_bottom(void)
 {
     unsigned long p;
     __asm__ ( "andl %%esp,%0; addl %2,%0" 
index efa170f77561be83cc4b1627e85bfaa23cd56760..fb5a7abc021496ff19a97c405e9e6697197132c0 100644 (file)
@@ -34,11 +34,11 @@ static inline execution_context_t *get_execution_context(void)
 }
 
 /*
- * Get the top-of-stack, as stored in the per-CPU TSS. This is actually
- * 64 bytes below the real top of the stack to allow space for:
+ * Get the bottom-of-stack, as stored in the per-CPU TSS. This is actually
+ * 64 bytes before the real bottom of the stack to allow space for:
  *  domain pointer, DS, ES, FS, GS, FS_BASE, GS_BASE_OS, GS_BASE_APP
  */
-static inline unsigned long get_stack_top(void)
+static inline unsigned long get_stack_bottom(void)
 {
     unsigned long p;
     __asm__ ( "orq %%rsp,%0; andq $~7,%0" 
index 6ebf988c3c4b2e586a77e48d8121b3699af9cc2a..3f14c3a8095a43be8165056e480f60c4eeacc0dd 100644 (file)
 #define HYPERVISOR_VIRT_END   (0xFFFF880000000000UL)
 #endif
 
+#ifndef __ASSEMBLY__
+
 /* The machine->physical mapping table starts at this address, read-only. */
 #ifndef machine_to_phys_mapping
 #define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
 #endif
 
-#ifndef __ASSEMBLY__
+/*
+ * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
+ *  @which == SEGBASE_*  ;  @base == 64-bit base address
+ * Returns 0 on success.
+ */
+#define SEGBASE_FS          0
+#define SEGBASE_GS_USER     1
+#define SEGBASE_GS_KERNEL   2
+
+/*
+ * int HYPERVISOR_switch_to_user(void)
+ *  All arguments are on the kernel stack, in the following format.
+ * Never returns if successful. Current kernel context is lost.
+ * If flags contains ECF_IN_SYSCALL:
+ *   Restore RIP, RFLAGS, RSP. 
+ *   Discard R11, RCX, CS, SS.
+ * Otherwise:
+ *   Restore R11, RCX, CS:RIP, RFLAGS, SS:RSP.
+ * All other registers are saved on hypercall entry and restored to user.
+ */
+struct switch_to_user {
+    /* Top of stack (%rsp at point of hypercall). */
+    u64 r11, rcx, flags, rip, cs, rflags, rsp, ss;
+    /* Bottom of switch_to_user stack frame. */
+} PACKED;
 
 /* NB. Both the following are 64 bits each. */
 typedef unsigned long memory_t;   /* Full-sized pointer/address/memory-size. */
@@ -136,8 +162,8 @@ typedef struct xen_regs
     u64 fs;      /* Non-zero => takes precedence over fs_base.     */
     u64 gs;      /* Non-zero => takes precedence over gs_base_app. */
     u64 fs_base;
-    u64 gs_base_os;
-    u64 gs_base_app;
+    u64 gs_base_kernel;
+    u64 gs_base_user;
 } PACKED execution_context_t;
 
 typedef u64 tsc_timestamp_t; /* RDTSC timestamp */
index c5543a2293e389ef0028b6420ebc7363ed21f144..a9222da7b613c171b143a661733339a87578b42b 100644 (file)
@@ -42,7 +42,7 @@
 #define __HYPERVISOR_set_debugreg          8
 #define __HYPERVISOR_get_debugreg          9
 #define __HYPERVISOR_update_descriptor    10
-#define __HYPERVISOR_set_fast_trap        11
+#define __HYPERVISOR_set_fast_trap        11 /* x86/32 only */
 #define __HYPERVISOR_dom_mem_op           12
 #define __HYPERVISOR_multicall            13
 #define __HYPERVISOR_update_va_mapping    14
 #define __HYPERVISOR_grant_table_op       20
 #define __HYPERVISOR_vm_assist            21
 #define __HYPERVISOR_update_va_mapping_otherdomain 22
-#define __HYPERVISOR_switch_vm86          23
+#define __HYPERVISOR_switch_vm86          23 /* x86/32 only */
+#define __HYPERVISOR_switch_to_user       23 /* x86/64 only */
 #define __HYPERVISOR_boot_vcpu            24
+#define __HYPERVISOR_set_segment_base     25 /* x86/64 only */
 
 /*
  * MULTICALLS
  *   val[7:0] == MMUEXT_NEW_BASEPTR:
  *   ptr[:2]  -- Machine address of new page-table base to install in MMU.
  * 
+ *   val[7:0] == MMUEXT_NEW_USER_BASEPTR: [x86/64 only]
+ *   ptr[:2]  -- Machine address of new page-table base to install in MMU
+ *               when in user space.
+ * 
  *   val[7:0] == MMUEXT_TLB_FLUSH:
  *   No additional arguments.
  * 
 #define MMUEXT_CLEAR_FOREIGNDOM 11
 #define MMUEXT_TRANSFER_PAGE    12 /* ptr = MA of frame; val[31:16] = dom    */
 #define MMUEXT_REASSIGN_PAGE    13
+#define MMUEXT_NEW_USER_BASEPTR 14
 #define MMUEXT_CMD_MASK        255
 #define MMUEXT_CMD_SHIFT         8